This is part two of a look at mass-shooting gun violence in the US. In this analysis, we examine the shooting incident data geographically.
# Load libraries ----
library(tidyverse)
library(tidymodels)   # loaded once; the original loaded it twice
library(tidytext)
library(lubridate)
library(scales)
library(janitor)
library(modeltime)
library(tictoc)
library(future)
library(doFuture)
library(tidyquant)
library(timetk)
library(thief)

# Parallel backend ----
# Register the future-based foreach backend and start one cluster
# worker per detected core for the parallel model fits below.
registerDoFuture()
n_cores <- parallel::detectCores()
plan(strategy = cluster,
     workers = parallel::makeCluster(n_cores))
Read in the saved data files: the full incident data and the list of top nine states.
# Load the previously saved incident data and the top-nine-states lookup.
complete_tbl <- read_rds("gva_data.rds")

# Character vector of the nine states with the most incidents.
top_nine_states <- read_csv("top_nine_states.csv")[["state"]]
## Rows: 9 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, state_code
## dbl (4): n_incidents, n_deaths, n_injuries, incidents_rank
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Here I prepare the primary data set to include some date-related fields for convenience when plotting and summarizing the data.
# Prepare the primary data set: clean column names, parse the incident
# date, and add convenience date fields for plotting and summarizing.
ms_tbl <- complete_tbl %>%
  janitor::clean_names() %>%
  select(-operations) %>%
  mutate(incident_date = mdy(incident_date),
         year = year(incident_date),
         # TRUE/FALSE spelled out — T and F are reassignable in R.
         month = month(incident_date, label = TRUE, abbr = TRUE),
         # Roll each date to the end of its month: one date key per
         # month for monthly aggregation.
         incident_month = rollforward(incident_date),
         dow = wday(incident_date, label = TRUE)) %>%
  arrange(incident_date)
Here I create a function to group and summarize the data in different ways since we will need to do this many times later.
# Group `df` by the supplied columns and summarize incident counts.
#
# @param df  Data frame containing `number_killed` and `number_injured`.
# @param ... Grouping columns, forwarded to group_by().
# @return Ungrouped tibble with n_incidents, n_deaths, and n_injuries
#   per group (NAs in the count columns are dropped from the sums).
group_function <- function(df, ...) {
  df %>%
    group_by(...) %>%
    summarize(n_incidents = n(),
              n_deaths = sum(number_killed, na.rm = TRUE),
              n_injuries = sum(number_injured, na.rm = TRUE),
              .groups = "drop")
}
Nesting the time series data
# Forecast horizon: number of future monthly periods to extend each
# series (also used as the test-split length below).
future_date <- 12
# Earlier single-series ("all US") version of the model-data prep,
# kept commented out for reference:
# ms_model_data <- ms_tbl %>%
# filter(incident_date < floor_date(Sys.Date(), "month"),
# year(incident_date) >= 2020) %>%
# group_function(incident_month) %>%
# select(-n_injuries, -n_deaths) %>%
# mutate(state = "us")
# #group_by(state) %>%
# #filter(n() > 10) %>%
# #ungroup()
# Monthly incident counts per state, restricted to complete months
# from 2020 onward.
ms_model_state_data <- ms_tbl %>%
filter(incident_date < floor_date(Sys.Date(), "month"),
year(incident_date) >= 2020) %>%
group_function(state, incident_month) %>%
select(-n_injuries, -n_deaths) %>%
# Fill state/month combinations with no recorded incidents with an
# explicit 0 so every state has a complete monthly series. Note
# complete() runs over ALL states before the top-nine filter.
complete(state, incident_month, fill = list(n_incidents = 0)) %>%
filter(state %in% top_nine_states)
# Nest each state's series, extend it `future_date` months into the
# future, and hold out the last `future_date` observed months as a
# test split.
nested_data_tbl <- ms_model_state_data %>% #ms_model_data %>%
# NOTE(review): extend_timeseries() takes .id_var directly, so this
# group_by(state) looks redundant — confirm before removing.
group_by(state) %>%
modeltime::extend_timeseries(.id_var = state,
.date_var = incident_month,
.length_future = future_date) %>%
modeltime::nest_timeseries(.id_var = state,
.length_future = future_date) %>%
modeltime::split_nested_timeseries(.length_test = future_date)
Models
# # MODELING ----
# # * XGBoost Recipe ----
# Feature recipe for the pure tree-based models: expand the date into
# calendar-signature features, drop the raw date column (tree models
# cannot use a Date predictor directly), remove zero-variance columns,
# and one-hot encode the nominal features.
rec_xgb <- recipe(n_incidents ~ ., extract_nested_train_split(nested_data_tbl)) %>%
step_timeseries_signature(incident_month) %>%
step_rm(incident_month) %>%
step_zv(all_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
# Recipe for the hybrid (ARIMA/Prophet + xgboost) models: same
# signature features, but the raw date column is KEPT because the
# ARIMA/Prophet component needs it.
rec_arima <- recipe(n_incidents ~ ., extract_nested_train_split(nested_data_tbl)) %>%
step_timeseries_signature(incident_month) %>%
step_zv(all_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
# * XGBoost Models ----
# XGBoost regression workflow, moderate learning rate.
wflw_xgb_1 <- workflow() %>%
  add_recipe(rec_xgb) %>%
  add_model(set_engine(boost_tree(mode = "regression", learn_rate = 0.35),
                       "xgboost"))
# XGBoost regression workflow, aggressive learning rate.
wflw_xgb_2 <- workflow() %>%
  add_recipe(rec_xgb) %>%
  add_model(set_engine(boost_tree(mode = "regression", learn_rate = 0.50),
                       "xgboost"))
# * Random Forest Model ----
# Default ranger random forest on the tree-model recipe.
wflw_rf_1 <- workflow() %>%
  add_recipe(rec_xgb) %>%
  add_model(set_engine(rand_forest(mode = "regression"), "ranger"))
# * ARIMA Boost ----
# ARIMA with an xgboost component, on the date-keeping recipe.
wflw_arima_1 <- workflow() %>%
  add_recipe(rec_arima) %>%
  add_model(set_engine(arima_boost(mode = "regression"), "arima_xgboost"))
# * Prophet Boost ----
# Prophet with an xgboost component, on the date-keeping recipe.
wflw_prophet_1 <- workflow() %>%
  add_recipe(rec_arima) %>%
  add_model(set_engine(prophet_boost(mode = "regression"), "prophet_xgboost"))
Test run and error checking
# # 1.0 TRY 1 TIME SERIES ----
# # - Tells us if our models work at least once (before we scale)
# Fit every candidate workflow on just the first nested series,
# serially (allow_par = FALSE), as a cheap smoke test before running
# all states in parallel.
try_sample_tbl <- nested_data_tbl %>%
slice(1) %>%
modeltime_nested_fit(
model_list = list(
wflw_xgb_1,
wflw_xgb_2,
wflw_rf_1,
wflw_arima_1,
#wflw_ets_1,
wflw_prophet_1),
control = control_nested_fit(
verbose = TRUE,
allow_par = FALSE))
## ℹ [1/1] Starting Modeltime Table: ID California...
## ✔ Model 1 Passed BOOST_TREE.
## ✔ Model 2 Passed BOOST_TREE.
## ✔ Model 3 Passed RAND_FOREST.
## ✔ Model 4 Passed ARIMA_BOOST.
## ✔ Model 5 Passed PROPHET_BOOST.
## ✔ [1/1] Finished Modeltime Table: ID California
## Finished in: 7.647063 secs.
# Print the fitted sample table, then pull any model errors it logged.
try_sample_tbl
# * Check Errors ----
try_sample_tbl %>% extract_nested_error_report()
Scale up and process models
# # 2.0 SCALE ----
# Start modeltime's parallel backend with one worker per detected core
# (was hard-coded to 16; reuse n_cores from setup so the worker count
# matches the machine this runs on).
parallel_start(n_cores)
# Fit every candidate workflow to every state's nested series, in
# parallel across the workers started above.
nested_modeltime_tbl <- nested_data_tbl %>%
modeltime_nested_fit(
model_list = list(
wflw_xgb_1,
wflw_xgb_2,
wflw_rf_1,
wflw_arima_1,
#wflw_ets_1,
wflw_prophet_1),
control = control_nested_fit(
verbose = TRUE,
allow_par = TRUE))
## Using existing parallel backend with 16 clusters (cores)...
## Beginning Parallel Loop | 0.006 seconds
## Finishing parallel backend. Clusters are remaining open. | 28.163 seconds
## Close clusters by running: `parallel_stop()`.
## Finished in: 28.1637 secs.
Error review
# # * Review Any Errors ----
# Surface any per-state, per-model fitting errors from the full run.
nested_modeltime_tbl %>% extract_nested_error_report()
Accuracy review
# # * Review Test Accuracy ----
# Out-of-sample accuracy metrics for each model, as a formatted table.
nested_modeltime_tbl %>%
extract_nested_test_accuracy() %>%
table_modeltime_accuracy()
Test Forecast against actuals
# # * Visualize Test Forecast ----
# Plot each model's test-window forecast against actuals, one facet
# per state.
nested_modeltime_tbl %>%
extract_nested_test_forecast() %>%
group_by(state) %>%
plot_modeltime_forecast(.facet_ncol = 3)
Select best models
# # 3.0 SELECT BEST ----
# Keep, for each state, the model with the lowest test-set MAE.
nested_best_tbl <- nested_modeltime_tbl %>%
modeltime_nested_select_best(metric = "mae", minimize = TRUE)
Visualize best models
# # * Visualize Best Models ----
# Plot only each state's winning model over the test window.
nested_best_tbl %>%
extract_nested_test_forecast() %>%
group_by(state) %>%
plot_modeltime_forecast(.facet_ncol = 3)
Model refits
# # 4.0 REFIT ----
# Refit each state's winning model on its full observed series
# (train + test) before forecasting the future window.
nested_best_refit_tbl <- nested_best_tbl %>%
modeltime_nested_refit(
control = control_refit(
verbose = TRUE,
allow_par = TRUE))
## Using existing parallel backend with 16 clusters (cores)...
## Beginning Parallel Loop | 0.008 seconds
## Finishing parallel backend. Clusters are remaining open. | 1.339 seconds
## Close clusters by running: `parallel_stop()`.
## Finished in: 1.339248 secs.
# Shut down the parallel workers now that refitting is finished.
parallel_stop()
Final error review
# # * Review Any Errors ----
# Check the refit step for any per-state errors.
nested_best_refit_tbl %>% extract_nested_error_report()
Visualize future forecast
# # * Visualize Future Forecast ----
# Plot the 12-month-ahead forecast from each state's refit best model.
nested_best_refit_tbl %>%
extract_nested_future_forecast() %>%
group_by(state) %>%
plot_modeltime_forecast(.facet_ncol = 3)